BigQuery GENERATE_TIMESTAMP_ARRAY でバケットを作って集計
GENERATE_ARRAY(start_expression, end_expression[, step_expression])
GENERATE_DATE_ARRAY(start_date, end_date[, INTERVAL INT64_expr date_part])
GENERATE_TIMESTAMP_ARRAY(start_timestamp, end_timestamp, INTERVAL step_expression date_part)
code:bucket.sql
WITH data AS (
# 集計対象のデータ
SELECT * FROM UNNEST(
ARRAY<STRUCT<count INT64, timestamp TIMESTAMP>>[
STRUCT(3, TIMESTAMP "2020-07-01 12:34:50"),
STRUCT(5, TIMESTAMP "2020-07-02 10:30:00"),
STRUCT(7, TIMESTAMP "2020-07-04 21:07:10"),
STRUCT(11, TIMESTAMP "2020-07-04 21:10:00")
])
), buckets AS (
# 時間バケット
SELECT * FROM UNNEST(
GENERATE_TIMESTAMP_ARRAY(
"2020-07-01 00:00:00", "2020-07-04 23:59:59",
INTERVAL 1 HOUR
)
) AS bucket
)
SELECT
bucket,
IFNULL(SUM(count), 0) as count
FROM buckets
LEFT JOIN data
ON buckets.bucket = TIMESTAMP_TRUNC(data.timestamp, HOUR)
GROUP BY bucket
15分単位など
GENERATE_TIMESTAMP_ARRAY は INTERVAL で 15 分単位や 30 分単位の数列を作れる
TIMESTAMP_TRUNC は HOUR や DAY などのユニット単位でしかできない
それ以下で丸めたいなら
TIMESTAMP_SECONDS(DIV(UNIX_SECONDS(timestamp), 15*60) * 15*60)
TIMESTAMP の値域は 0001-01-01 00:00:00~9999-12-31 23:59:59.999999 UTC に対し、UNIX_SECONDS は1970-01-01 00:00:00 以降の値なのでそこは注意
引数にする
よく書く
code:with_params.sql
WITH params AS (
SELECT
DATE_SUB(CURRENT_DATE("Asia/Tokyo"), INTERVAL 3 DAY) AS until,
DATE_SUB(CURRENT_DATE("Asia/Tokyo"), INTERVAL 365+3 DAY) AS since,
), date_bucket AS (
SELECT jst_date FROM params, UNNEST(GENERATE_DATE_ARRAY(params.since, params.until)) AS jst_date
)
SELECT * FROM date_bucket
時間の範囲を作る
code:generate_with_range.sql
SELECT
ts,
TIMESTAMP_SUB(ts, INTERVAL 8 DAY) AS since,
TIMESTAMP_SUB(ts, INTERVAL 1 DAY) AS until
FROM UNNEST (
GENERATE_TIMESTAMP_ARRAY(
TIMESTAMP("2022-02-20 10:00:00", "Asia/Tokyo"),
TIMESTAMP("2022-03-05 10:00:00", "Asia/Tokyo"),
INTERVAL 1 DAY
)
) AS ts